Check the difference between the eGenes from Yazar et al. (2022) and eGenes identified by CSeQTL from GTEx whole blood RNA-seq data.
library(data.table)
library(readxl)
library(stringr)
library(ggplot2)
library(ggpubr)
library(dplyr)
library(reshape2)
theme_set(theme_classic())
params = list(cs_q_cutoff=5e-3, fold_cutoff=1)
cs_q_cutoff = params$cs_q_cutoff
fold_cutoff = params$fold_cutoff
cat("cs_q_cutoff: ", cs_q_cutoff, "\n")## cs_q_cutoff: 0.005
cat("fold_cutoff: ", fold_cutoff, "\n")## fold_cutoff: 1
config = sprintf("cs_q_%.0e_fold_%.1f",
cs_q_cutoff, fold_cutoff)
config## [1] "cs_q_5e-03_fold_1.0"
CSeQTL_dir = "../../CSeQTL/data2share/"
GTEx_eqtl_file = "GTExBlood_eqtls.rds"
Yazar_2022_dir = "../../CSeQTL/Yazar_2022/"
Yazar_2022_kept_folder = "Yazar_eSNP1_50kb/"
Yazar_2022_genes = "Yazar_gene_considered.rds"fnm = sprintf("results/step3_%s_eGenes.rds", config)
eGenes_overlap_list = readRDS(fnm)
sapply(eGenes_overlap_list, function(x){
sapply(x, function(z){sapply(z, length)})})## B IN B Mem CD4 NC CD4 ET CD8 NC CD8 ET CD8 S100B Mono C Mono NC NK R NK
## [1,] 291 293 336 363 327 340 288 281 213 101 371
## [2,] 131 128 198 194 178 180 174 265 226 211 261
cs_cell_types = c("B", "CD4T", "CD8T", "Monocyte", "NK")
ot_cell_types = c("B IN", "B Mem", "CD4 NC", "CD4 ET",
"CD8 NC", "CD8 ET", "CD8 S100B", "Mono C",
"Mono NC", "NK R", "NK")
ct_groups = list(B = c("B IN", "B Mem"),
CD4T = c("CD4 NC", "CD4 ET"),
CD8T = c("CD8 NC", "CD8 ET", "CD8 S100B"),
Monocyte = c("Mono C", "Mono NC"),
NK = c("NK R", "NK"))plist = list()
for (i in 1:length(ot_cell_types)){
ot_ct = ot_cell_types[i]
ot_ct_str = sub(" ", "_", str_squish(sub("/", "", ot_ct)))
Yazar_ct_eGenes_path = paste0(Yazar_2022_dir, Yazar_2022_kept_folder,
"Yazar_eSNP1_50kb_", ot_ct_str, ".csv")
df_Yazar_ct = read.csv(Yazar_ct_eGenes_path, header = TRUE)
mat_i = match(eGenes_overlap_list[[ot_ct]][[1]]$Yazar,
df_Yazar_ct$Gene.Ensembl.ID)
stopifnot(sum(is.na(mat_i)) == 0)
df_Yazar_ct = df_Yazar_ct[mat_i,]
df_Yazar_ct$CSeQTL =
df_Yazar_ct$Gene.Ensembl.ID %in% eGenes_overlap_list[[ot_ct]][[1]]$CSeQTL
table(df_Yazar_ct$CSeQTL)
ks1 = ks.test(log10(df_Yazar_ct$dist[!df_Yazar_ct$CSeQTL] + 1),
log10(df_Yazar_ct$dist[df_Yazar_ct$CSeQTL]+1))
plist[[3*(i-1)+1]] = ggplot(df_Yazar_ct,
aes(x=CSeQTL, y=log10(dist+1), color=CSeQTL)) +
geom_violin(trim=FALSE) + geom_boxplot(width=0.1) +
ggtitle(sprintf("%s, %d vs. %d", ot_ct, sum(!df_Yazar_ct$CSeQTL),
sum(df_Yazar_ct$CSeQTL)),
subtitle=sprintf("KS test p-val=%.1e", ks1$p.value))
df_Yazar_ct$qvalue[which(df_Yazar_ct$qvalue < 1e-50)] = 1e-50
ks1 = ks.test(-log10(df_Yazar_ct$qvalue)[!df_Yazar_ct$CSeQTL],
-log10(df_Yazar_ct$qvalue)[df_Yazar_ct$CSeQTL])
plist[[3*(i-1)+2]] = ggplot(df_Yazar_ct, aes(x=CSeQTL, y=-log10(qvalue),
color=CSeQTL)) +
geom_violin(trim=FALSE) + geom_boxplot(width=0.1) +
ggtitle(label="", subtitle = sprintf("KS test p-val=%.1e", ks1$p.value))
ks1 = ks.test(-log10(df_Yazar_ct$FDR)[!df_Yazar_ct$CSeQTL],
-log10(df_Yazar_ct$FDR)[df_Yazar_ct$CSeQTL])
plist[[3*(i-1)+3]] = ggplot(df_Yazar_ct, aes(x=CSeQTL,
y=-log10(FDR),
color=CSeQTL)) +
geom_violin(trim=FALSE) + geom_boxplot(width=0.1) +
ggtitle(label="", subtitle = sprintf("KS test p-val=%.1e", ks1$p.value))
}
length(plist)## [1] 33
ggarrange(plotlist=plist[1:12], nrow = 4, ncol = 3,
common.legend=TRUE, legend="top")ggarrange(plotlist=plist[13:21], nrow = 4, ncol = 3,
common.legend=TRUE, legend="top")ggarrange(plotlist=plist[22:33], nrow = 4, ncol = 3,
common.legend=TRUE, legend="top")GTExWB <- readRDS(paste0(CSeQTL_dir, GTEx_eqtl_file))
length(GTExWB)## [1] 2
names(GTExWB)## [1] "eqtl" "summary"
lapply(GTExWB, dim)## $eqtl
## [1] 759684 18
##
## $summary
## [1] 64 13
GTExWB = GTExWB[["eqtl"]]
dim(GTExWB)## [1] 759684 18
GTExWB[1:2,]## chr ENSG GENE CELLTYPE SNP MUa
## 1 chr1 ENSG00000000457.13 SCYL3 Bulk chr1:169796199:G:A:b38 181.5319
## 2 chr1 ENSG00000000457.13 SCYL3 Bulk chr1:169891332:G:A:b38 NA
## ETA PVAL CISTRANS trim perm AF MODEL nSNP nIND
## 1 0.7871129 6.937226e-66 CIS FALSE FALSE NULL cseqtl 2969 2969
## 2 -0.2600950 6.222035e-32 <NA> FALSE FALSE NULL ols 2969 2969
## permutation_PVAL qvalue CT2
## 1 2.059662e-62 2.510377e-62 <NA>
## 2 1.847322e-28 5.945281e-28 <NA>
table(GTExWB$CELLTYPE, useNA = "ifany")##
## B Bulk CD4T CD8T Monocyte Neutrophil NK
## 94437 96138 94779 94419 94611 95492 94713
## OTHER
## 95095
GTExWB$ENSG_version = GTExWB$ENSG
GTExWB$ENSG = gsub("\\..*","", GTExWB$ENSG_version)
GTExWB = GTExWB[which((GTExWB$trim==TRUE) &
(GTExWB$perm==FALSE) &
(GTExWB$MODEL=="cseqtl") &
(!is.na(GTExWB$PVAL))), ]
plist = list()
k = 1
for (i in 1:length(cs_cell_types)){
cct = cs_cell_types[i]
df_GTEx_i = GTExWB[which(GTExWB$CELLTYPE == cct), ]
for (ct_ot in ct_groups[[cct]]){
mat_i = match(eGenes_overlap_list[[ct_ot]][[1]]$CSeQTL, df_GTEx_i$ENSG)
stopifnot(sum(is.na(mat_i)) == 0)
df_GTEx = df_GTEx_i[mat_i,]
df_GTEx$sc_eGene = df_GTEx$ENSG %in% eGenes_overlap_list[[ct_ot]][[1]]$Yazar
table(df_GTEx$sc_eGene)
ks1 = ks.test(log10(df_GTEx$MUa[!df_GTEx$sc_eGene]),
log10(df_GTEx$MUa[df_GTEx$sc_eGene]))
plist[[k]] = ggplot(df_GTEx,
aes(x=sc_eGene, y=log10(MUa), color=sc_eGene)) +
geom_violin(trim=FALSE) + geom_boxplot(width=0.1) +
ggtitle(sprintf("%s, %d vs. %d", ct_ot,
sum(!df_GTEx$sc_eGene),
sum(df_GTEx$sc_eGene)),
subtitle=sprintf("KS test p-val=%.1e", ks1$p.value))
k = k + 1
df_GTEx$qvalue[which(df_GTEx$qvalue < 1e-50)] = 1e-50
ks1 = ks.test(-log10(df_GTEx$qvalue)[!df_GTEx$sc_eGene],
-log10(df_GTEx$qvalue)[df_GTEx$sc_eGene])
plist[[k]] = ggplot(df_GTEx, aes(x=sc_eGene,
y=-log10(qvalue),
color=sc_eGene)) +
geom_violin(trim=FALSE) + geom_boxplot(width=0.1) +
ggtitle(label="", subtitle = sprintf("KS test p-val=%.1e", ks1$p.value))
k = k + 1
ks1 = ks.test(abs(log10(df_GTEx$ETA))[!df_GTEx$sc_eGene],
abs(log10(df_GTEx$ETA))[df_GTEx$sc_eGene])
plist[[k]] = ggplot(df_GTEx, aes(x=sc_eGene,
y=abs(log10(ETA)),
color=sc_eGene)) +
geom_violin(trim=FALSE) + geom_boxplot(width=0.1) +
ggtitle(label="", subtitle = sprintf("KS test p-val=%.1e", ks1$p.value))
k = k + 1
}
}
length(plist)## [1] 33
ggarrange(plotlist=plist[1:12], nrow = 4, ncol = 3,
common.legend=TRUE, legend="top")ggarrange(plotlist=plist[13:21], nrow = 4, ncol = 3,
common.legend=TRUE, legend="top")ggarrange(plotlist=plist[22:33], nrow = 4, ncol = 3,
common.legend=TRUE, legend="top")gc()## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 1279517 68.4 3087424 164.9 NA 3087424 164.9
## Vcells 4324436 33.0 18052722 137.8 32768 22565820 172.2
sessionInfo()## R version 4.1.2 (2021-11-01)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] reshape2_1.4.4 dplyr_1.0.7 ggpubr_0.4.0 ggplot2_3.4.0
## [5] stringr_1.4.0 readxl_1.3.1 data.table_1.14.2
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.1.1 xfun_0.28 bslib_0.3.1 purrr_0.3.4
## [5] carData_3.0-4 colorspace_2.0-2 vctrs_0.5.1 generics_0.1.1
## [9] htmltools_0.5.2 yaml_2.2.1 utf8_1.2.2 rlang_1.0.6
## [13] jquerylib_0.1.4 pillar_1.6.4 glue_1.6.2 withr_2.5.0
## [17] DBI_1.1.1 plyr_1.8.6 lifecycle_1.0.3 munsell_0.5.0
## [21] ggsignif_0.6.3 gtable_0.3.0 cellranger_1.1.0 evaluate_0.14
## [25] labeling_0.4.2 knitr_1.36 fastmap_1.1.0 fansi_0.5.0
## [29] highr_0.9 broom_0.7.10 Rcpp_1.0.7 scales_1.2.1
## [33] backports_1.4.0 jsonlite_1.7.2 abind_1.4-5 farver_2.1.0
## [37] gridExtra_2.3 digest_0.6.29 stringi_1.7.6 rstatix_0.7.0
## [41] cowplot_1.1.1 grid_4.1.2 cli_3.4.1 tools_4.1.2
## [45] magrittr_2.0.1 sass_0.4.0 tibble_3.1.6 crayon_1.4.2
## [49] tidyr_1.1.4 car_3.0-12 pkgconfig_2.0.3 ellipsis_0.3.2
## [53] assertthat_0.2.1 rmarkdown_2.11 rstudioapi_0.13 R6_2.5.1
## [57] compiler_4.1.2